First, we want to use the PCA script which uses plink to run a PCA on WGS data vcf file.

For this example I want to run a PCA from the vcf file we recieved from Drew.

File name: /dss/dsslegfs01/pr53da/pr53da-dss-0034/vcf/Safran_lab/hirundo.rustica.300.id.allsites.renamed.miss02.maf05.ingroup.vcf.gz

We obtain the eigenvectors/eigenvalues using the script: 1_PCA_WGS.sh

Plotting PCA axes

We can set up a function that uses ggplot in order to plot the PCA from the eigenvalues and eigenvector files, while also pulling information from the metadata, which gives us information on the subspecies/population information. This can be altered to use shape as well to look at two different metadata columns at once (at the moment I only use color).

In this section, where the tag’sub’ is refers to subspecies,so if you would like to plot rather by population, the colour= needs to be set to ‘pop’ and the color_sub needs to be set to color_pop and needs to reflect the number of populations present.

The pop.map file is a text file which has three columns ind (individual name matching the vcf file), pop (corresponding pop of that individual), sub(corresponding subspecies of that individual). This is a tab separated file.

# def functions --------
plot.pca <- function(data,X,Y, colors, plot_title){
    p <- ggplot(data = data, aes_string(x=X, y=Y, fill= "sub", color = "sub", label="ind",shape="pop")) + 
      geom_point(size=4) +
      theme_bw() +
      scale_fill_manual(values=colors, na.translate = FALSE) +
      scale_color_manual(values=c("grey33", "#00316e", "grey33", "grey33", "grey33", "grey33"), na.translate = FALSE) +
      #scale_shape_manual(values=c(3,21,7,22,8,23:25))+
      scale_shape_manual(values=c(3,21:25, 1, 2), na.translate = FALSE)+
      ylab(paste0(Y ,"(", signif(pve$pve[pve$PC==Y], 3), "%)")) +
      xlab(paste0(X ,"(", signif(pve$pve[pve$PC==X], 3), "%)")) +
      ylim(-0.3,0.2) +
      labs(fill="",shape="")  +
      ggtitle(plot_title) +
      theme(plot.title = element_text(hjust = 0.5),   # Center the title
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      legend.position = "bottom",               # Move legend to the bottom
      legend.title = element_blank(), 
      legend.box="vertical", 
      legend.margin=margin()) +
      guides(
      color = FALSE,      
      fill = guide_legend(override.aes = list(shape=21), nrow = 1, byrow = TRUE),  
      shape = guide_legend(nrow = 1, byrow = TRUE)   
    )
    
    return(p)
}

##this looks at the eigenvalues and the percent of variance explained by each PC axis
plot.pve <- function(pve){
   pve.p <- ggplot(data=pve, aes(x=PC, y=pve)) +
    geom_bar(stat="identity", fill="#418979") + 
    theme_bw()  +
    ylab("Explained variation (%)") +
    xlab("Principal Component") +
    ggtitle("Scree plot - PCA ")+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
  
  return(pve.p)
}

# def variables --------
eigenval <- "/dss/dsslegfs01/pr53da/pr53da-dss-0034/projects/2021_SwallowWGS_168/1_Population_Structure/1_PCA/3_85.SM/HR.83.eigenval"
eigenvec <- "/dss/dsslegfs01/pr53da/pr53da-dss-0034/projects/2021_SwallowWGS_168/1_Population_Structure/1_PCA/3_85.SM/HR.83.eigenvec"
pop <- read.table("/dss/dsslegfs01/pr53da/pr53da-dss-0034/projects/2021_SwallowWGS_168/1_Population_Structure/1_PCA/3_85.SM/HR.83.fam", header = FALSE, stringsAsFactors = FALSE)

# Separate the first column into multiple columns
pop <- pop %>%
  mutate(V1 = as.character(V1)) %>%
  tidyr::separate(V1, into = c("pop", "year", "sub", "id", "type", "age", "sex"), sep = "_") %>%
  dplyr::select(c("pop", "year", "sub", "id", "sex", "V2"))
# Convert year column to numeric
pop$year <- as.numeric(pop$year)


#Fix wrongly assigned individuals due to issues with sequencing (see emaisl with Drew Schield)
individuals_US <- c("US_02_E_2171-28178_BL_ADL_U", "US_02_E_2171-28179_BL_ADL_U", "US_02_E_3121-88841_BL_ADL_U")
individuals_EG <- c("EG_15_S_1602_BL_ADL_U", "EG_15_S_1605_BL_ADL_U", "EG_15_S_1609_BL_ADL_U", "EG_15_S_1611_BL_ADL_U")
# Update pop and sub columns based on individual names
pop <- pop %>%
  mutate(pop = if_else(V2 %in% individuals_US & pop == "US" & sub == "E", "EG", pop),
         sub = if_else(V2 %in% individuals_US & sub == "E", "S", sub),
         pop = if_else(V2 %in% individuals_EG & pop == "EG" & sub == "S", "US", pop),
         sub = if_else(V2 %in% individuals_EG & sub == "S", "E", sub))

#write.table(pop, "/dss/dsslegfs01/pr53da/pr53da-dss-0034/projects/2021_SwallowWGS_168/1_Population_Structure/1_PCA/3_85.SM/85_pop.txt", quote=F, row.names=F)
pop <- read.table("/dss/dsslegfs01/pr53da/pr53da-dss-0034/projects/2021_SwallowWGS_168/1_Population_Structure/1_PCA/3_85.SM/85_pop.txt", header=T)

color_sub <- c( "#39ad48", #E
                "#00316e", #G
                "#d30000", #R
                #"#9716a8", #RG
                #"#FFA500", #RT
                "#be6400", #S
                "#f4d000", #T
                "#622a0f", #TV
                "grey") #NA/unknown

#Get eigen values and vectors -----
pca <- read.table(eigenvec)
eigenval <- scan(eigenval)

# Clean-up data -------

# remove nuisance column
pca <- pca[,-1]
# set names
names(pca)[1] <- "ind"
names(pca)[2:ncol(pca)] <- paste0("PC", 1:(ncol(pca)-1))
# join pop data to pca results
pca <- dplyr::left_join(pca, pop,by = c('ind' = 'V2'))

Cumulative variance of components

Here we want to plot the amount of variance described by each PC axis. We do this by using the eigenvalues and converting this to a percentage of variance.

#Plot cumulative variance of components
pve <- data.frame(PC = paste0("PC", 1:(ncol(pca)-6)), pve = eigenval/sum(eigenval)*100)
pve$PC <- factor(pve$PC, levels=unique(pve$PC))
ggplotly(plot.pve(pve))
pve1 <- plot.pve(pve)

Plot PCA

We can make plots looking at the first 3 axes of the PCA.

#first define the colors you want to use, this needs to be equal to the number of populations or subspecies. For the moment the script is only set up to use color,but could be extended.

  


#Plot first 3 PCA combos 
p1 <- plot.pca(pca, "PC1","PC2", color_sub, "WGS")
p2 <- plot.pca(pca, "PC1","PC3", color_sub, "WGS")
p3 <- plot.pca(pca, "PC2","PC3", color_sub, "WGS")

p1

ggarrange(pve1,                                                 # First row with scatter plot
          ggarrange(p2, p3, ncol = 2, labels = c("B", "C")), # Second row with box and dot plots
          nrow = 2, 
          labels = "A"                                        # Labels of the scatter plot
          )